home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
ShareWare OnLine 2
/
ShareWare OnLine Volume 2 (CMS Software)(1993).iso
/
infor
/
ddj_more.zip
/
TOOLS.C
< prev
next >
Wrap
Text File
|
1993-01-04
|
17KB
|
800 lines
/*-----------------------------------------------------------------------
*
* TOOLS.C: The expression parser used by GREP
*
*
* Copyright (c) 1984 Allen Holub
*
* Permission to copy this program or any part of this program
* is granted in the case of personal, non-commercial use only.
* Any use for profit or other commercial gain without written
* permission of the author is prohibited.
*
* If you've been given this program by a friend, and you find
* it worthwhile, I'd appreciate your sending $35 to me for the
* program.
*
* Software Engineering Consultants, P.O. Box 5679, Berkeley CA, 94705
*
*-----------------------------------------------------------------------
*/
/*
* This module contains the various routines needed by grep
* to match regular expressions. Routines are ordered
* alphabetically.
*/
#include <stdio.h>
#include "tools.h"
char *amatch( lin, pat, boln)
char *lin, *boln;
TOKEN *pat;
{
/* Scans through the pattern template looking for a match
* with lin. Each element of lin is compared with the template
* until either a mis-match is found or the end of the template
* is reached. In the former case a 0 is returned; in the latter,
* a pointer into lin (pointing to the last character in the
* matched pattern) is returned.
*
* "lin" is a pointer to the line being searched.
* "pat" is a pointer to a template made by makepat().
* "boln" is a pointer into "lin" which points at the
* character at the beginning of line.
*/
register char *bocl, *rval, *strstart;
if (pat == 0)
return ( (char*)0 );
strstart = lin;
while ( pat )
{
if (pat->tok == CLOSURE && pat->next)
{
/*
* Process a closure:
* First skip over the closure token to the
* object to be repeated. This object can be
* a character class.
*/
pat = pat->next;
/* Now match as many occurrences of the
* closure pattern as possible.
*/
bocl = lin;
while ( *lin && omatch(&lin, pat, boln) )
;
/* 'Lin' now points to the character that
* made us fail. Now go on to process the
* rest of the string. A problem here is
* a character following the closure which
* could have been in the closure.
* For example, in the pattern "[a-z]*t" (which
* matches any lower-case word ending in a t),
* the final 't' will be sucked up in the while
* loop. So, if the match fails, we back up a
* notch and try to match the rest of the
* string again, repeating this process
* recursively until we get back to the
* beginning of the closure. The recusion
* goes, at most, two levels deep.
*/
if (pat = pat->next)
{
while (bocl <= lin )
{
if (rval = amatch(lin, pat, boln) )
{
/* success */
return(rval);
}
else
--lin;
}
return (0); /* match failed */
}
}
else if ( omatch(&lin, pat, boln) )
{
pat = pat->next;
}
else
{
return (0);
}
}
/*
* Note that omatch() advances lin to point at the next
* character to be matched. Consequently, when we reach
* the end of the template, lin will be pointing at the
* character following the last character matched.
* the exceptions are templates containing only a
* BOLN or EOLN token. In these cases omatch doesn't
* advance.
*
* So, decrement lin to make it point at the end of the
* matched string. Then, check to make sure that we haven't
* decremented past the beginning of the string.
*
* A philosophical pint should be mentioned here. Is $
* a position or a character? (i.e., does $ mean the EOL
* character itself or does it mean the character at the end of
* the line.) I decided here to make it mean the former, in
* order to make the behavior of amatch() consistent. If you
* give amatch a pattern ^$ (amtch all lines consisting only
* of an end of line) then, since something has to be returned,
* a pointer to the end of line character itself is returned.
*
* The --lin is done outside the return statement because max()
* is often a macro (which has side-effects).
*/
--lin;
return ( max(strstart, lin) );
}
/*---------------------------------------------------------------------*/
#ifdef DEBUG
delete( ch, str )
int ch;
register char *str;
{
/* Delete the first occurrence of character from string
* moving everything else over a notch to fill the hole.
*/
ch &= 0xff;
while ( *str && *str != ch)
str++;
while ( *str )
{
*str = *(str + 1);
str++;
}
}
#endif
/*----------------------------------------------------------------------*/
setbit( c, field )
int c;
char field[];
{
/* Set a bit in the bit ASCII bit map corresponding to the
* character c. Field must be at least 16 bytes long.
*/
field[ (c & 0x7f) >> 3 ] |= 1 << (c & 0x07) ;
}
/*---------------------------------------------------------------------*/
testbit( c, field)
int c;
char field[];
{
/* See if the bit corresponding to c in field is set.
*/
return ( field[ (c & 0x7f) >> 3] & (1 << (c & 0x07)) );
}
/*----------------------------------------------------------------------*/
char *dodash( delim, src, map)
int delim;
char *src, *map;
{
/* Expand the set pointed to by "*src" into the bitmap "map."
* Stop at delim or end of string. Update *src to point
* at terminator. A set can have one element {x} or several
* elements ( {abcdefghijklmnopqrstuvwxyz} and {a-z}
* are equivalent ). Note that the dash notation is expanded
* as sequential numbers. This means (since we are using the
* ASCII character set) that a-Z will contain the entire alphabet
* plus the symbols: [\]^_'
*
* The character classes are stored in a 16 byte wide bit field
* where each bit corresponds to an ASCII character
*/
register int first, last;
char *start;
start = src;
while( *src && *src != delim)
{
if( *src != '-')
setbit( esc( &src ), map );
else if( src == start || *(src + 1) == delim )
setbit ('-', map );
else{
src++;
if( *src < *(src - 2) )
{
first = *src;
last = *(src - 2);
}
else
{
first = *(src - 2);
last = *src;
}
while( ++first <= last )
setbit( first, map );
src++;
}
}
return( src );
}
/*----------------------------------------------------------------------*/
int esc(s)
char **s;
{
/* Map escape sequences into their equivalent symbols. Returns the
* Correct ASCII character. c is the character following the \.
*/
register int rval;
if( **s != ESCAPE )
rval = *( (*s)++ );
else
{
(*s)++;
switch( toupper(**s) )
{
case '\0': rval = ESCAPE; break;
case 'B': rval = '\b'; break;
case 'F': rval = '\f'; break;
case 'N': rval = '\n'; break;
case 'R': rval = '\r'; break;
case 'S': rval = ' '; break;
case 'T': rval = '\t'; break;
default: rval = **s; break;
}
(*s)++;
}
return (rval);
}
/*----------------------------------------------------------------------*/
TOKEN *getpat( arg )
char *arg;
{
/* Translate arg into a TOKEN string
*/
return ( makepat(arg, '\000' ) );
}
/*----------------------------------------------------------------------*/
#ifdef DEGUG
int insert( ch, str )
int ch;
register char *str;
{
/* Insert ch into str at the place pointed to by str. Move
* everything else over a notch
*/
register char *bp;
bp = str;
while (*str) /* Find the end of the string */
str++;
do /* Move the tail over one notch */
{
*(str + 1) = *str;
str--;
} while (str >= bp);
*bp = ch; /* Put the char in the hole. */
}
#endif
/*----------------------------------------------------------------------*/
int isalphanum(c)
int c;
{
/* Return true if c is a alphabetic character or digit,
* flase otherwise.
*/
return ( ('a' <= c && c <= 'z') ||
('A' <= c && c <= 'Z') ||
('0' <= c && c <= '9')
);
}
/*----------------------------------------------------------------------*/
TOKEN *makepat(arg, delim)
char *arg;
int delim;
{
/* Make a pattern template from the string pointed to by arg.
* Stop when delim or '\000' or '\n' is found in arg.
* Return a pointer to the pattern template.
*
* The pattern templates used here are somewhat different
* than those ued in the book; each token is a structure
* of the form TOKEN (see tools.h). A token consists of
* an identifer, a ponter to a string, a literal
* characte and a ponter to another token. This last is 0 if
* there is no subsequent token.
*
* The one strangemess here is caused (again) by CLOSURE which
* has to be put in front of the previous token. To make this
* insertion a little easier, the 'nex' field of the last
* token in the chain (the one pointed to by 'tail') is make
* to point at the previous node. When we are finished,
* tail->next is set ot 0.
*/
TOKEN *head, *tail;
TOKEN *ntok;
int error, i;
/* Check for characters that aren't legal at the beginning
* of a template.
*/
if (*arg == '\0' || *arg == delim || *arg == '\n' || *arg == CLOSURE)
return(0);
error = 0;
head = 0;
tail = 0;
for(; *arg && *arg != delim && *arg != '\n' && !error; arg++)
{
ntok = (TOKEN *) malloc(TOKSIZE);
if( error = !ntok )
{
fprintf(stderr,
"Not enough memory for pattern template\n");
break;
}
switch(*arg)
{
case ANY:
ntok->tok = ANY;
break;
case BOL:
if (head == 0) /* then this is the first symbol */
ntok->tok = BOL;
else
error = 1;
break;
case EOL:
if ( *(arg + 1) == delim || *(arg + 1) == '\0'
|| *(arg + 1) == '\n' )
ntok->tok = EOL;
else
error = 1;
break;
case CCL:
if (*(arg + 1) == NEGATE)
{
ntok->tok = NCCL;
arg += 2;
}
else
{
ntok->tok = CCL;
arg++;
}
if ( ntok->bitmap = (char *) calloc( 16, 1 ) )
{
arg = dodash(CCLEND, arg, ntok->bitmap );
}
else
{
fprintf(stderr,"Not enough memory for pat\n");
error = 1;
}
break;
case CLOSURE:
switch ( tail->tok )
{
case BOL:
case EOL:
case CLOSURE:
return(0);
default:
ntok->tok = CLOSURE;
}
break;
default:
ntok->tok = LITCHAR;
ntok->lchar = esc( &arg );
--arg; /* esc advances us past the character */
}
if ( error )
{
unmakepat(head);
return(0);
}
else if (head == 0)
{
/* This is the first node in the chain.
*/
ntok->next = 0;
head = tail = ntok;
}
else if( ntok->tok != CLOSURE)
{
/* Insert at end of list (after tail) */
tail->next = ntok;
ntok->next = tail;
tail = ntok;
}
else if( head != tail )
{
/* More than one node in the chain. Insert the
* CLOSURE node immediately in from of tail.
*/
(tail->next)->next = ntok;
ntok->next = tail;
}
else
{
/* Only one node in the chain, Insert the CLOSURE
* node at the head of the linked list.
*/
ntok->next = head;
tail->next = ntok;
head = ntok;
}
}
tail->next = 0;
return (head);
}
/*----------------------------------------------------------------------*/
char *matchs(line, pat, ret_endp)
char *line;
TOKEN *pat;
int ret_endp;
{
/*
* Compares line and pattern. Line is a character string while
* pat is a pattern template made by getpat().
* Returns:
* 1. A zero of no match was found.
* 2. A pointer to the last character satisfying the
* match if ret_endp is non-zero.
* 3. A pointer to the beginning of the matched string
* if ret_endp is 0;
*
* For example:
*
* matchs ("1234567890", getpat("4[0-9]*7"),0);
*
* will return a pointer to the '4', while
*
* matchs ("1234567890", getpat("4[0-9]*7"),1);
*
* will return a pointer to the '7'.
*/
char *rval, *bptr;
bptr = line;
while (*line)
{
if ( (rval = amatch(line, pat, bptr)) == 0)
{
line++;
}
else
{
rval = ret_endp ? rval : line ;
break;
}
}
return (rval);
}
/*----------------------------------------------------------------------*/
char *stoupper(str)
char *str;
{
/*
* Map the entire string pointed to by str to upper case.
* Return str.
*/
char *rval;
rval = str;
while (*str)
{
if ( 'a' <= *str && *str <= 'z' )
*str -= ('a' - 'A');
str++;
}
return (rval);
}
/*----------------------------------------------------------------------*/
int omatch (linp, pat, boln)
char **linp, *boln;
TOKEN *pat;
{
/* Match one pattern element, pointed at by pat, with the
* character at **limp. Return non-zero on match.
* Otherwise, return 0. *limp is advanced to skip over the
* matched character; it is not advanced on failure. The
* amound of the advance is 0 for patterns that match null
* strings, 1 otherwise. "boln" should point at the position
* that will match a BOL token.
*/
register int advance;
advance = -1;
if ( **linp )
{
switch ( pat->tok )
{
case LITCHAR:
if ( **linp == pat->lchar )
advance = 1;
break;
case BOL:
if ( *linp == boln )
advance = 0;
break;
case ANY:
if ( **linp != '\n' )
advance = 1;
break;
case EOL:
if ( **linp == '\n' )
advance = 0;
break;
case CCL:
if ( testbit( **linp, pat->bitmap ) )
advance = 1;
break;
case NCCL:
if ( !testbit( **linp, pat->bitmap) )
advance = 1;
break;
default:
printf("omatch: can't happen\n");
}
}
if (advance >= 0)
*linp += advance;
return ( ++advance );
}
/*----------------------------------------------------------------------*/
#ifdef DEBUG
int pr_line(ln)
register char *ln;
{
/* Print out Ln, if a non-printing character is found, print
* out its numerical value in the form "\0<hex number>".
* Again, this is a debugging aid. It lets you see what's
* really on the line.
*/
for ( ; *ln ; ln++)
{
if ( (' ' <= *ln) && (*ln <= '~') )
putchar(*ln);
else
{
printf("\\0x%02x", *ln);
if (*ln == '\n')
putchar('\n');
}
}
}
/*----------------------------------------------------------------------*/
int pr_tok( head )
TOKEN *head;
{
/* Print out the pattern template (linked list of TOKENs)
* pointed to by head. This is a useful debugging aid. Note
* that pr_tok() just scans along the linked list, terminating
* on a null pointer; so, you can't use pr_tok from inside
* makepat() because tail->next points to the previous
* node instead of being null. Note that NEGATE or OR_SYM
* are not listed because they won't occur in a template.
*/
register char *str;
register int i;
for ( ; head ; head = head->next )
{
switch(head->tok)
{
case BOL:
str = "BOL";
break;
case EOL:
str = "EOL";
break;
case ANY:
str = "ANY";
break;
case LITCHAR:
str = "LITCHAR";
break;
case ESCAPE:
str = "ESCAPE";
break;
case CCL:
str = "CCL";
break;
case CCLEND:
str = "CCLEND";
break;
case NCCL:
str = "NCCL";
break;
case CLOSURE:
str = "CLOSURE";
break;
default:
str = "**** unknown ****";
}
printf("%-8s at: 0x%x,", str, head);
if (head->tok == CCL || head->tok == NCCL)
{
printf("string (at 0x%x) =<", head->bitmap );
for ( i = 0; i < 0x7f ; i++)
if ( testbit( i, head->bitmap) )
putchar(i);
printf(">, ");
}
else if (head->tok == LITCHAR)
printf("lchar = %c, ", head->lchar);
printf("next = 0x%x\n", head->next);
}
putchar('\n');
}
#endif
/*----------------------------------------------------------------------*/
int unmakepat(head)
TOKEN *head;
{
/* Free up the memory used for the token string */
register TOKEN *old_head;
while (head)
{
switch (head->tok)
{
case CCL:
case NCCL:
free(head->bitmap);
/* no break, fall through to default */
default:
old_head = head;
head = head->next;
free(old_head);
break;
}
}
}
/*----------------------------------------------------------------------*/
char *index( c, str )
char *str;
{
/* Return true if c is in str.
*/
while ( *str )
if ( *str++ == c )
return str;
return 0;
}